{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", "from sklearn.preprocessing import StandardScaler\n", "import matplotlib.pyplot as plt # plotting\n", "import numpy as np # linear algebra\n", "import os # accessing directory structure\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "import csv\n", "import re\n", "\n", "import jieba\n", "from sklearn.feature_extraction.text import TfidfTransformer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import gensim\n", "from gensim.models import Word2Vec\n", "from sklearn.preprocessing import scale\n", "import multiprocessing\n", "\n", "from snownlp import SnowNLP\n", "import jieba.analyse" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textclasspositive
index
018年结婚 哈哈哈00.900696
12017最后顿大餐吃完两人世界明年就是三个人一起啦许下生日愿望️希望一家人都能顺利平安健康🏻🏻🏻10.999904
2意盎然的季节!祝愿大家都生机勃勃,郁郁葱葱!20.736431
32017 遇见挚友 遇见我老公 结了婚有了小芒果 希望2018也超级美好️30.983905
42018.1.140.500000
52018加油!50.895319
62018年做一个更加真实的自己。️30.783433
72018年的第一天,完美的错过了一辆公交车。 德州60.934181
82018年目标1.赚钱买房2.谈场恋爱,遇到对的人就结婚3.拥有一副健康的身体4.学会一种乐...70.999799
92018年第一个假期:元旦,就这么过去了,感冒咳嗽发高烧给这个元旦带来了不一样的节日,好快呀...80.733896
\n", "
" ], "text/plain": [ " text class positive\n", "index \n", "0 18年结婚 哈哈哈 0 0.900696\n", "1 2017最后顿大餐吃完两人世界明年就是三个人一起啦许下生日愿望️希望一家人都能顺利平安健康🏻🏻🏻 1 0.999904\n", "2 意盎然的季节!祝愿大家都生机勃勃,郁郁葱葱! 2 0.736431\n", "3 2017 遇见挚友 遇见我老公 结了婚有了小芒果 希望2018也超级美好️ 3 0.983905\n", "4 2018.1.1 4 0.500000\n", "5 2018加油! 5 0.895319\n", "6 2018年做一个更加真实的自己。️ 3 0.783433\n", "7 2018年的第一天,完美的错过了一辆公交车。 德州 6 0.934181\n", "8 2018年目标1.赚钱买房2.谈场恋爱,遇到对的人就结婚3.拥有一副健康的身体4.学会一种乐... 7 0.999799\n", "9 2018年第一个假期:元旦,就这么过去了,感冒咳嗽发高烧给这个元旦带来了不一样的节日,好快呀... 8 0.733896" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dff = pd.read_csv(\"C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/train.csv\",index_col=0)\n", "dff['text'] = dff['text'].fillna('')\n", "dff.head(10)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "[' ', ' ', '~', '。', ',', '…', '~', '!', '\"', '#', '$', '%', '&', \"'\", '(', ')', '*', '+', ',', '-', '--', '.', ':', '://', '::', ';', '<', '=', '>', '>>', '?', '@', 'A', 'Lex', '', '\\\\', '', '^', '_', '`', 'exp', 'sub', 'sup', '|', '}', '~', '~~~~', '·', '×', '×××', 'Δ', 'Ψ', 'γ', 'μ', 'φ', 'φ.', 'В', '—', '——', '———', '‘', '’', '’‘', '“', '”', '”,', '…', '……', '…………………………………………………③', '′∈', '′|', '℃', 'Ⅲ', '↑', '→', '∈', '∪φ∈', '≈', '①', '②', '②c', '③', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '──', '■', '▲', '\\u3000', '、', '。', '〈', '〉', '《', '》']\n" ] } ], "source": [ "def stopwordslist():\n", " f = open(\"C:/Users/Kai/Desktop/stop.txt\", \"r\")\n", " line = f.readline()\n", " stopwords = []\n", " index = 0\n", " while line:\n", " if index % 1000 == 0:\n", " print(index)\n", " index += 1\n", " line = line.replace('\\n', '')\n", " line = line.replace('[', '')\n", " line = line.replace(']', '')\n", " line = line.replace(']', '')\n", " line = line.replace('[', '')\n", " \n", " stopwords.append(line)\n", " line = f.readline()\n", "\n", " print(stopwords[:100])\n", " return stopwords\n", "\n", "# 创建一个停用词列表\n", "stopwords = stopwordslist()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# 对句子进行中文分词\n", "def seg_depart(sentence):\n", " # 对文档中的每一行进行中文分词\n", " sentence_depart = jieba.cut(sentence.strip())\n", " # 输出结果为outstr\n", " outstr = ''\n", " # 去停用词\n", " for word in sentence_depart:\n", " if word not in stopwords:\n", " if word != '\\t':\n", " outstr += word\n", " outstr += \" \"\n", " return outstr" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "clas = dff['class'].values" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textclasspositive
index
0我是正面哦00.347826
1爱是恒久忍耐,又有恩慈。爱是不嫉妒,不自夸,不张狂,不轻易发怒。不计算人的恶。凡事包容。凡事...00.496333
2讨厌死了,上班上班上班不停的上班我真的超级累。什么都不干还是超级超级累。00.000422
3矮马大半夜的放肌肉男不让人睡觉了00.409895
4谢谢陈先生。00.768959
5我的2016要早点睡别熬夜00.625607
6周锐锐哥!爱你00.970187
7塞尼亚岛00.500000
8只可惜没能去现场00.100791
9自从发现这个号都处于一种忍不住不看看了睡不着的状态00.355194
\n", "
" ], "text/plain": [ " text class positive\n", "index \n", "0 我是正面哦 0 0.347826\n", "1 爱是恒久忍耐,又有恩慈。爱是不嫉妒,不自夸,不张狂,不轻易发怒。不计算人的恶。凡事包容。凡事... 0 0.496333\n", "2 讨厌死了,上班上班上班不停的上班我真的超级累。什么都不干还是超级超级累。 0 0.000422\n", "3 矮马大半夜的放肌肉男不让人睡觉了 0 0.409895\n", "4 谢谢陈先生。 0 0.768959\n", "5 我的2016要早点睡别熬夜 0 0.625607\n", "6 周锐锐哥!爱你 0 0.970187\n", "7 塞尼亚岛 0 0.500000\n", "8 只可惜没能去现场 0 0.100791\n", "9 自从发现这个号都处于一种忍不住不看看了睡不着的状态 0 0.355194" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfTest = pd.read_csv(\"C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/test.csv\",index_col=0)\n", "dfTest['text'] = dfTest['text'].fillna('')\n", "dfTest.head(10)\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n", "Loading model from cache C:\\Users\\Kai\\AppData\\Local\\Temp\\jieba.cache\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loading model cost 0.879 seconds.\n", "Prefix dict has been built succesfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "100000\n", "200000\n", "300000\n", "400000\n", "500000\n", "600000\n", "700000\n", "800000\n" ] } ], "source": [ "# 分词\n", "sen = dff['text'].values\n", "\n", "for i in range(len(sen)):\n", " if i % 100000 == 0:\n", " print(i)\n", " sen[i] = seg_depart(sen[i])\n", " \n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "100000\n" ] } ], "source": [ "senTest = dfTest['text'].values\n", "\n", "for i in range(len(senTest)):\n", " if i % 100000 == 0:\n", " print(i)\n", " senTest[i] = seg_depart(senTest[i])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['我 是 正面 哦 '\n", " '爱是 恒久 忍耐 又 有恩慈 爱是 不嫉妒 不 自夸 不 张狂 不 轻易 发怒 不 计算 人 的 恶 凡事 包容 凡事 相信 凡事 盼望 凡事 忍耐 爱是 永不 止息 '\n", " '讨厌 死 了 上班 上班 上班 不停 的 上班 我 真的 超级 累 什么 都 不 干 还是 超级 超级 累 '\n", " '矮马 大半夜 的 放 肌肉男 不让 人 睡觉 了 ' '谢谢 陈先生 ' '我 的 2016 要 早点 睡别 熬夜 ' '周锐 锐哥 爱 你 '\n", " '塞 尼亚岛 ' '只 可惜 没能 去 现场 ' '自从 发现 这个 号 都 处于 一种 忍不住 不 看看 了 睡不着 的 状态 '\n", " '真系 咁 钟意 音乐 咩 '\n", " '感恩 2 续 他们 都 会 过 得 很 幸福 甜蜜 爸爸 的 身体 也 越来越 健壮 健康 妈妈 也 越来越 温柔 越 女人 我 自己 也 越来越 漂亮 皮肤 好好 非常 水润 皮肤 非常 光滑 我 弟弟 也 越来越 帅 越来越 思想 成熟 做事 非常 稳重 也 越来越 让 家人 开心 在 南昌 明年 一定 会 有 到 我 的 单身公寓 我 明年 一定 会 拿到 我 的 粉车 '\n", " '迷尚 自然 的 主页 ' '问叹 女王 权杖 口红 我 最 爱 的 口红 是 口红 又 是 装饰品 '\n", " '有个 顺序 得 先 读书 然后 才能 多 走走 否则 行再 多路 也 是 个 邮差 音乐 也 是 一样 我 倒 是 也 想 施施然 上台 去 可是 要 被 踹 下来 的 呀 预祝 巡演 成功 '\n", " '年终 福利 ' '声音 好好 听 '\n", " '少年 迪玛希 谁家 翩翩少年 郎 横空出世 迷人眼 着 调 专访 少年 迪玛希 谁家 翩翩少年 郎 横空出世 迷人眼 着 调 专访 '\n", " '喜欢 的 紫薯 甜品店 来 了 ' '我 不是 好惹 的 第 12 名 ' '一天 比 一天 像 公主 梦 都 被 满足 '\n", " '有 你 在 身边 很 心安 去 校医 室 有人 陪 去 体检 有人 陪 干什么 你 都 在 很快 又 不累 '\n", " '果然 全世界 女孩子 都 是 一样 的 ... 这 看 脸 的 世界 '\n", " '11 月 7 日 20 00 上 新 开拍 亲们 来 捧场 哦 上 新 当晚 有 给 力 优惠 还有 神秘 福袋 哦 '\n", " '吉林 百嘉 门将 原 国家 沙滩 足球队 主力 门将 温廷元 扑出 了 对方 王凯 的 点球 ' '午间 运动 ' '湖南 张家界 天门山 '\n", " '一杯 红酒 一盘 残羹剩饭 几块 蛋糕 当做 大餐 我 肯定 醉 了 ' '萌萌 哒 的 我 '\n", " '发现 一些 古懂 你们 以前 是 用 这种 真正 的 幻灯片 的 吗 ']\n" ] } ], "source": [ "print(senTest[:30])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# from sklearn.model_selection import train_test_split\n", "# X_train, X_test, y_train, y_test = train_test_split(sen, clas, test_size=0.1, random_state=42) ######" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)\n", "# transformer = TfidfTransformer()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# tf_X_train = vectorizer.fit_transform(X_train)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# tf_X_test = vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# tf_Test = vectorizer.transform(senTest)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB\n", "# maxxi = 0\n", "# maxxscore = 0\n", "# for i in np.arange(10, 20, 0.5):\n", "# mnb = ComplementNB(alpha=i)\n", "# mnb.fit(tf_X_train, y_train)\n", "# print(mnb.score(tf_X_test,y_test), i)\n", "# if maxxscore < mnb.score(tf_X_test,y_test):\n", "# maxxscore = mnb.score(tf_X_test,y_test)\n", "# maxxi = i\n", "\n", "# print(maxxscore, maxxi)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# mnb = ComplementNB(alpha=11.5)\n", "# mnb.fit(tf_X_train, y_train)\n", "# print(mnb.score(tf_X_test,y_test), 0.1)\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# pred = mnb.predict(tf_Test)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# csvFile = open('C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/last0.csv','w', newline='', encoding='UTF-8') # 设置newline,否则两行之间会空一行\n", "# writer = csv.writer(csvFile)\n", "\n", "# writer.writerow(['ID', 'Expected'])\n", "# for i in range(len(pred)):\n", "# if i % 50000 == 0:\n", "# print(i)\n", "# writer.writerow([int(i), int(pred[i])])\n", " \n", "# csvFile.close()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# from sklearn.svm import LinearSVC\n", "# model = LinearSVC(penalty='l1', dual=False, tol=1e-3)\n", "# model.fit(tf_X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# print(model.score(tf_X_test,y_test))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "E:\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", " from ._conv import register_converters as _register_converters\n", "Using TensorFlow backend.\n", "E:\\Anaconda3\\lib\\site-packages\\keras_preprocessing\\text.py:178: UserWarning: The `nb_words` argument in `Tokenizer` has been renamed `num_words`.\n", " warnings.warn('The `nb_words` argument in `Tokenizer` '\n" ] } ], "source": [ "# libraries\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "import matplotlib.pyplot as plt\n", "np.random.seed(32)\n", "\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import roc_auc_score\n", "from sklearn.manifold import TSNE\n", "\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.preprocessing.sequence import pad_sequences\n", "from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout\n", "from keras.utils.np_utils import to_categorical\n", "\n", "\n", "%matplotlib inline\n", "MAX_NB_WORDS = 20000\n", "# finally, vectorize the text samples into a 2D integer tensor\n", "tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "tokenizer.fit_on_texts(sen)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "sequences = tokenizer.texts_to_sequences(sen)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "sequences_test = tokenizer.texts_to_sequences(senTest)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "MAX_SEQUENCE_LENGTH = 300\n", "\n", "# pad sequences with 0s\n", "x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", "x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textpositive01234567...62636465666768697071
index
0 18 年 结婚 哈哈哈0.90069610000000...0000000000
12017 最后 顿 大餐 吃 完 两人 世界 明年 就是 三个 人 一起 啦 许下 生日 愿...0.99990401000000...0000000000
2意 盎然 的 季节 祝愿 大家 都 生机勃勃 郁郁葱葱0.73643100100000...0000000000
32017 遇见 挚友 遇见 我 老公 结了婚 有 了 小 芒果 希望 2018 也 超级 美...0.98390500010000...0000000000
42018.1 10.50000000001000...0000000000
\n", "

5 rows × 74 columns

\n", "
" ], "text/plain": [ " text positive 0 1 2 \\\n", "index \n", "0  18 年 结婚 哈哈哈 0.900696 1 0 0 \n", "1 2017 最后 顿 大餐 吃 完 两人 世界 明年 就是 三个 人 一起 啦 许下 生日 愿... 0.999904 0 1 0 \n", "2 意 盎然 的 季节 祝愿 大家 都 生机勃勃 郁郁葱葱 0.736431 0 0 1 \n", "3 2017 遇见 挚友 遇见 我 老公 结了婚 有 了 小 芒果 希望 2018 也 超级 美... 0.983905 0 0 0 \n", "4 2018.1 1 0.500000 0 0 0 \n", "\n", " 3 4 5 6 7 ... 62 63 64 65 66 67 68 69 70 71 \n", "index ... \n", "0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "3 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "4 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "\n", "[5 rows x 74 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "one_hot = pd.get_dummies(dff[\"class\"])\n", "dff.drop(['class'], axis=1, inplace=True)\n", "dff = pd.concat([dff,one_hot], axis=1)\n", "dff.head()" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "# a, b, yTrain, yTest = train_test_split(sen, y_train, test_size=0.1, random_state=42) ######" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[1 0 0 ... 0 0 0]\n", " [0 1 0 ... 0 0 0]\n", " [0 0 1 ... 0 0 0]\n", " ...\n", " [0 0 0 ... 0 0 0]\n", " [0 0 0 ... 0 0 0]\n", " [0 0 0 ... 0 0 0]]\n" ] } ], "source": [ "y_train = dff.drop(['text', 'positive'],axis=1).values\n", "print(y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 下面的cell是 分数为 0.7+ 的模型源代码\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from keras.layers import Dense, Input, Flatten\n", "from keras.layers import GlobalAveragePooling1D, Embedding\n", "from keras.models import Model\n", "\n", "EMBEDDING_DIM = 100\n", "N_CLASSES = 72\n", "\n", "# input: a sequence of MAX_SEQUENCE_LENGTH integers\n", "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", "\n", "embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM,\n", " input_length=MAX_SEQUENCE_LENGTH,\n", " trainable=True)\n", "embedded_sequences = embedding_layer(sequence_input)\n", "\n", "average = GlobalAveragePooling1D()(embedded_sequences)\n", "predictions = Dense(N_CLASSES, activation='softmax')(average)\n", "\n", "model = Model(sequence_input, predictions)\n", "model.compile(loss='categorical_crossentropy',\n", " optimizer='adam', metrics=['acc'])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "input_1 (InputLayer) (None, 300) 0 \n", "_________________________________________________________________\n", "embedding_1 (Embedding) (None, 300, 100) 2000000 \n", "_________________________________________________________________\n", "global_average_pooling1d_1 ( (None, 100) 0 \n", "_________________________________________________________________\n", "dense_1 (Dense) (None, 72) 7272 \n", "=================================================================\n", "Total params: 2,007,272\n", "Trainable params: 2,007,272\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/2\n", "776847/776847 [==============================] - 464s 597us/step - loss: 3.7645 - acc: 0.1069 - val_loss: 3.7131 - val_acc: 0.1204\n", "Epoch 2/2\n", "776847/776847 [==============================] - 437s 562us/step - loss: 3.6407 - acc: 0.1374 - val_loss: 3.6173 - val_acc: 0.1411\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=2, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/2\n", "776847/776847 [==============================] - 443s 570us/step - loss: 3.5505 - acc: 0.1555 - val_loss: 3.5596 - val_acc: 0.1546\n", "Epoch 2/2\n", "776847/776847 [==============================] - 447s 576us/step - loss: 3.4915 - acc: 0.1659 - val_loss: 3.5098 - val_acc: 0.1678\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=2, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 449s 578us/step - loss: 3.4495 - acc: 0.1732 - val_loss: 3.4994 - val_acc: 0.1659\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "res = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "pred = model.predict(res)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "result = np.argmax(pred, axis = 1)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "50000\n", "100000\n", "150000\n" ] } ], "source": [ "# 写入文件\n", "csvFile = open('C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/1.csv','w', newline='', encoding='UTF-8') # 设置newline,否则两行之间会空一行\n", "writer = csv.writer(csvFile)\n", "\n", "writer.writerow(['ID', 'Expected'])\n", "for i in range(len(result)):\n", " if i % 50000 == 0:\n", " print(i)\n", " writer.writerow([int(i), int(result[i])])\n", " \n", "csvFile.close()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "model.save('my_model_1.h5')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 232s 299us/step - loss: 3.4180 - acc: 0.1784 - val_loss: 3.4784 - val_acc: 0.1712\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=256)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "50000\n", "100000\n", "150000\n" ] } ], "source": [ "pred = model.predict(res)\n", "result = np.argmax(pred, axis = 1)\n", "\n", "# 写入文件\n", "csvFile = open('C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/2.csv','w', newline='', encoding='UTF-8') # 设置newline,否则两行之间会空一行\n", "writer = csv.writer(csvFile)\n", "\n", "writer.writerow(['ID', 'Expected'])\n", "for i in range(len(result)):\n", " if i % 50000 == 0:\n", " print(i)\n", " writer.writerow([int(i), int(result[i])])\n", " \n", "csvFile.close()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "model.save(\"2.h5\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 250s 322us/step - loss: 3.3982 - acc: 0.1813 - val_loss: 3.4675 - val_acc: 0.1734\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=256)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "50000\n", "100000\n", "150000\n" ] } ], "source": [ "pred = model.predict(res)\n", "result = np.argmax(pred, axis = 1)\n", "\n", "# 写入文件\n", "csvFile = open('C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/3.csv','w', newline='', encoding='UTF-8') # 设置newline,否则两行之间会空一行\n", "writer = csv.writer(csvFile)\n", "\n", "writer.writerow(['ID', 'Expected'])\n", "for i in range(len(result)):\n", " if i % 50000 == 0:\n", " print(i)\n", " writer.writerow([int(i), int(result[i])])\n", " \n", "csvFile.close()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 505s 650us/step - loss: 3.3785 - acc: 0.1845 - val_loss: 3.4588 - val_acc: 0.1735\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/2\n", "776847/776847 [==============================] - 630s 811us/step - loss: 3.3545 - acc: 0.1882 - val_loss: 3.4532 - val_acc: 0.1756\n", "Epoch 2/2\n", "776847/776847 [==============================] - 606s 780us/step - loss: 3.3336 - acc: 0.1918 - val_loss: 3.4609 - val_acc: 0.1732\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=2, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 503s 647us/step - loss: 3.3144 - acc: 0.1953 - val_loss: 3.4599 - val_acc: 0.1736\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 499s 642us/step - loss: 3.2965 - acc: 0.1981 - val_loss: 3.4537 - val_acc: 0.1725\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/2\n", "776847/776847 [==============================] - 508s 654us/step - loss: 3.2792 - acc: 0.2008 - val_loss: 3.4511 - val_acc: 0.1769\n", "Epoch 2/2\n", "776847/776847 [==============================] - 497s 640us/step - loss: 3.2625 - acc: 0.2034 - val_loss: 3.4543 - val_acc: 0.1739\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=2, batch_size=128)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 157s 202us/step - loss: 3.2409 - acc: 0.2078 - val_loss: 3.4523 - val_acc: 0.1768\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=512)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 776847 samples, validate on 86317 samples\n", "Epoch 1/1\n", "776847/776847 [==============================] - 179s 230us/step - loss: 3.2358 - acc: 0.2087 - val_loss: 3.4597 - val_acc: 0.1725\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(x_train, y_train, validation_split=0.1, epochs=1, batch_size=512)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "50000\n", "100000\n", "150000\n" ] } ], "source": [ "pred = model.predict(res)\n", "result = np.argmax(pred, axis = 1)\n", "\n", "# 写入文件\n", "csvFile = open('C:/Users/Kai/Desktop/171840708_IntroDM_MiningChallenge/mining-challenge-for-nju-introdm-2019/Mining Challenge Dataset/3.csv','w', newline='', encoding='UTF-8') # 设置newline,否则两行之间会空一行\n", "writer = csv.writer(csvFile)\n", "\n", "writer.writerow(['ID', 'Expected'])\n", "for i in range(len(result)):\n", " if i % 50000 == 0:\n", " print(i)\n", " writer.writerow([int(i), int(result[i])])\n", " \n", "csvFile.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }